from dataset.corpus_2014 import Corpus2014Dataset
import pandas as pd
import numpy as np

# Build the dataset (constructed with nltk_style=True) and grab both splits.
ds = Corpus2014Dataset(nltk_style=True)
train_data, test_data = ds.train_data, ds.test_data

# Only the training split is separated into features and labels here.
train_x, trainy = Corpus2014Dataset.features_and_labels(train_data)
# test_x, testy = Corpus2014Dataset.features_and_labels(test_data)

# len() of every training feature entry.
data_lens = [len(sample) for sample in train_x]

# Print, one per line and in this order: mean, min, max, median.
for stat in (np.mean, min, max, np.median):
    print(stat(data_lens))

'''
Output of a previous run (length statistics of train_x):
mean:   86.05386570253347
min:    2
max:    1540
median: 75.0
'''

# 1621

# data = pd.DataFrame(train_x,columns=['text'])
# data['text_len'] = data['text'].map(lambda x:len(x))
# print(data['text_len'].describe())